suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(stringr))Drive BC Network Analysis
Data Loading and Initial Cleaning
# Function to standardize attribute names and handle event type variations across different years
standardize_name <- function(x) {
x <- x %>%
tolower() %>%
gsub("_", "", .) %>%
gsub(" ", "", .) %>%
trimws()
}
# Function to load and clean data for a given year
load_and_clean_data <- function(file_path) {
data <- read_csv(file_path, show_col_types = FALSE)
# Check if this is pre-2018 or post-2018 format
if("cause" %in% names(data)) {
# Pre-2018 format
data <- data %>%
rename(
EVENT_TYPE = type,
AREA_NAME = district,
SEVERITY = severity,
START_DATETIME = localupdatetime
)
}
# Standardize values and convert types
data <- data %>%
mutate(
EVENT_TYPE = standardize_name(EVENT_TYPE),
AREA_NAME = standardize_name(AREA_NAME),
SEVERITY = standardize_name(SEVERITY),
START_DATETIME = parse_date_time(START_DATETIME,
orders = c("ymd HMS", "mdy HMS", "dmy HMS",
"ymd HM", "mdy HM", "dmy HM",
"ymd", "mdy", "dmy"))
) %>%
filter(!is.na(EVENT_TYPE)) %>%
filter(!EVENT_TYPE %in% c("planned"))
# the columns we need
data <- data %>%
select(EVENT_TYPE, AREA_NAME, SEVERITY, START_DATETIME) %>%
drop_na(START_DATETIME)
return(data)
}# all the years apply the cleaning functions
data_files <- list.files("../data", pattern = "drivebceventshist.*\\.csv", full.names = TRUE)
all_data <- lapply(data_files, load_and_clean_data)
drivebc_data <- bind_rows(all_data)
drivebc_data <- drivebc_data %>%
mutate(
EVENT_TYPE = as.factor(EVENT_TYPE),
AREA_NAME = as.factor(AREA_NAME),
SEVERITY = as.factor(SEVERITY)
)
str(drivebc_data)tibble [2,748,950 × 4] (S3: tbl_df/tbl/data.frame)
$ EVENT_TYPE : Factor w/ 7 levels "construction",..: 2 2 2 2 2 2 2 2 2 2 ...
$ AREA_NAME : Factor w/ 13 levels "bulkleystikinedistrict",..: 4 4 4 4 13 9 9 13 8 8 ...
$ SEVERITY : Factor w/ 3 levels "major","minor",..: 3 3 3 3 3 3 3 3 3 3 ...
$ START_DATETIME: POSIXct[1:2748950], format: "2006-01-03 08:06:50" "2006-02-13 16:03:49" ...
head(drivebc_data)Time/Event Bipartite Graphs
These graphs show the relationship between the areas and the types of events that occurred in each year. The areas are represented by circles, and the events are represented by squares. An edge between an area and an event indicates that an event of that TYPE occurred in that area. This is useful as it shows what events have occurred in each area over time which is good for the initial analysis of the data as we can see if all events are occurring in all areas.
create_bipartite_graph <- function(year_data, year) {
area_names <- unique(year_data$AREA_NAME)
active_events <- unique(year_data$EVENT_TYPE)
# areas and events as nodes
nodes <- data.frame(
name = c(as.character(area_names), as.character(active_events)),
type = c(rep(TRUE, length(area_names)),
rep(FALSE, length(active_events)))
)
# edge exists if a particular type of event occurred in a particular area
edges <- year_data %>%
select(AREA_NAME, EVENT_TYPE) %>%
distinct()
g <- graph_from_data_frame(d = edges, vertices = nodes, directed = FALSE)
V(g)$color <- ifelse(V(g)$type, "lightblue", "lightgreen")
V(g)$shape <- ifelse(V(g)$type, "circle", "square")
V(g)$size <- 4
V(g)$label.cex <- 0.7
V(g)$label.dist <- .75
V(g)$label.degree <- ifelse(seq_along(V(g)) %% 2 == 0, pi/2, -pi/2)
return(g)
}# graphs for each year
years <- 2006:2023
for(year in years) {
year_data <- drivebc_data %>%
filter(year(START_DATETIME) == year)
if(nrow(year_data) == 0) {
plot.new()
title(main = paste("No Data Available for", year))
next
}
g <- create_bipartite_graph(year_data, year)
plot(g,
layout = layout_as_bipartite,
vertex.label = V(g)$name,
vertex.color = V(g)$color,
vertex.shape = V(g)$shape,
vertex.size = V(g)$size,
vertex.label.cex = V(g)$label.cex,
vertex.label.dist = V(g)$label.dist,
vertex.label.degree = V(g)$label.degree,
edge.width = 0.3,
main = paste("BC Road Events", year))
}